import requests
import re
from urllib.parse import unquote
import json
from pprint import pprint
from DrissionPage import ChromiumPage
import os
folder_path = "D:\\DouyinDownload\\pic"

# Load note URLs from urls.txt and de-duplicate while preserving first-seen
# order. dict.fromkeys gives O(1) membership checks instead of the original
# O(n^2) "not in list" scan; blank lines are skipped so we never issue a
# request to an empty URL.
with open('urls.txt', 'r', encoding='utf-8') as f:
    unique_urls = list(dict.fromkeys(
        line.strip() for line in f if line.strip()
    ))
# url = 'https://www.xiaohongshu.com/explore/66d7cff7000000001e01bc29'
# Request headers sent with every xiaohongshu.com request.
# NOTE(review): the cookie is a hard-coded session copied from a logged-in
# browser (contains web_session / a1 / webId tokens) — it will expire, and
# every request silently degrades to the logged-out page once it does.
# Consider loading it from a config file or environment variable instead.
headers = {
    'cookie':'abRequestId=736449df-7a56-5745-b964-135ea07f1ce0; a1=19057f6577b6adtap20mksriq89zskocez5eagj1b50000117031; webId=865e5deaae925a394ca1c003bcc63797; gid=yj82WiK2iWd4yj82WiK2WukVWDK0f90UJ86T2yDMF3AYjd28uMTlS0888yyW8qy8DYSqifYD; xsecappid=xhs-pc-web; acw_tc=27b0044d67af996e652f1ac74b5098afdd7fd8a43a894d95a0e0c89a0db477ee; websectiga=634d3ad75ffb42a2ade2c5e1705a73c845837578aeb31ba0e442d75c648da36a; sec_poison_id=276cb4a7-460a-4118-8cd7-737cc52aeed5; web_session=0400698ef432c15677bb3d4728354bddc86f1d; webBuild=4.40.3; unread={%22ub%22:%22671758490000000016022f13%22%2C%22ue%22:%2267173e5f000000001b02f88e%22%2C%22uc%22:25}',
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
}

# Download every image of every note. One sub-directory per note, named
# after the (sanitized) note title; notes whose directory already exists
# are treated as previously downloaded and skipped.
for url in unique_urls:
    # timeout prevents a single dead connection from hanging the whole run
    note_response = requests.get(url, headers=headers, timeout=15)
    note_html = note_response.text

    # Title appears as "<title>{title} - 小红书</title>". The original code
    # indexed [0] unconditionally, which raised IndexError and aborted the
    # entire run whenever the pattern was absent (deleted note, login wall).
    title_match = re.findall('<title>(.*?) - 小红书</title>', note_html)
    if not title_match:
        print('跳过:', url)
        continue
    note_title = title_match[0]

    # Each image URL is exposed in an og:image meta tag.
    note_info = re.findall('<meta name="og:image" content="(.*?)">', note_html)

    # Strip characters Windows forbids in directory names.
    illegal_chars = r'[\\/:*?"<>|]'
    note_title = re.sub(illegal_chars, '', note_title)

    dir_name = os.path.join(folder_path, note_title)
    if os.path.exists(dir_name):
        # Already downloaded on a previous run — skip.
        continue
    os.makedirs(dir_name)

    print('正在爬取:', note_title, url)
    for j, pic_url in enumerate(note_info):
        pic_content = requests.get(pic_url, headers=headers, timeout=15).content
        with open(os.path.join(dir_name, str(j) + '.jpg'), 'wb') as f:
            f.write(pic_content)

# note_response = requests.get(url, headers=headers)
# note_html = note_response.text
# # print(note_html)
#
# note_title = re.findall('<title>(.*?) - 小红书</title>', note_html)[0]
# # 右滑换风格
# note_info = re.findall('<meta name="og:image" content="(.*?)">', note_html)
# # ['网址1','网址2']
# dir_name = folder_path + '\\' + note_title
# if not os.path.exists(dir_name):
#     os.makedirs(dir_name)
# print('正在爬取:',note_title)
# j = 0
# for i in note_info:
#     pic_content = requests.get(i, headers=headers).content
#     with open(dir_name + '\\' + str(j) + '.jpg', 'wb') as f:
#         f.write(pic_content)
#     j += 1